AutoGluon Study

Author

김보람

Published

February 9, 2024

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
#---#
from autogluon.tabular import TabularPredictor
import autogluon.eda.auto as auto
#---#
import warnings
warnings.filterwarnings('ignore')
X = [1.0, 3000, 2, 1]   # toy train feature
y = [0, 1, 0, 0]        # toy train label
XX = [2.0, 2800]        # toy test feature
yy = [0, 1]             # toy test label
df_tr = pd.DataFrame([X,y]).transpose().rename(columns={0:'X', 1:'y'})
df_tr
        X    y
0     1.0  0.0
1  3000.0  1.0
2     2.0  0.0
3     1.0  0.0
df_tst = pd.DataFrame([XX,yy]).transpose().rename(columns={0:'X', 1:'y'})
df_tst
        X    y
0     2.0  0.0
1  2800.0  1.0
predictor = TabularPredictor(label='y')
No path specified. Models will be saved in: "AutogluonModels/ag-20240201_083938/"
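The timestamped directory comes from leaving path unset; it could have been pinned at init instead, e.g. (directory name is illustrative, not the one used in this run):

TabularPredictor(label='y', path='AutogluonModels/toy-binary')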
path = predictor.fit(df_tr)  # note: fit() returns the predictor itself, so 'path' is a TabularPredictor
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240201_083938/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov  2 18:01:13 UTC 2
Disk Space Avail:   598.15 GB / 982.82 GB (60.9%)
Train Data Rows:    4
Train Data Columns: 1
Label Column: y
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [0.0, 1.0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Warning: Updated label_count_threshold from 10 to 1 to avoid cutting too many classes.
Warning: Updated holdout_frac from 0.2 to 0.251 to avoid cutting too many classes.
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    50963.24 MB
    Train Data (Original)  Memory Usage: 0.0 MB (0.0% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 1 | ['X']
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 1 | ['X']
    0.0s = Fit runtime
    1 features in original data used to generate 1 features in processed data.
    Train Data (Processed) Memory Usage: 0.0 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.04s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.251, Train Rows: 3, Val Rows: 1
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
    Warning: Exception caused KNeighborsUnif to fail during training... Skipping this model.
        Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 5
Detailed Traceback:
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1755, in _train_and_save
    y_pred_proba_val = model.predict_proba(X_val)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/models/abstract/abstract_model.py", line 931, in predict_proba
    y_pred_proba = self._predict_proba(X=X, **kwargs)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/models/abstract/abstract_model.py", line 949, in _predict_proba
    y_pred_proba = self.model.predict_proba(X)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/neighbors/_classification.py", line 283, in predict_proba
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/neighbors/_base.py", line 810, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 5
Fitting model: KNeighborsDist ...
    Warning: Exception caused KNeighborsDist to fail during training... Skipping this model.
        Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 5
Detailed Traceback:
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1755, in _train_and_save
    y_pred_proba_val = model.predict_proba(X_val)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/models/abstract/abstract_model.py", line 931, in predict_proba
    y_pred_proba = self._predict_proba(X=X, **kwargs)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/models/abstract/abstract_model.py", line 949, in _predict_proba
    y_pred_proba = self.model.predict_proba(X)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/neighbors/_classification.py", line 286, in predict_proba
    neigh_dist, neigh_ind = self.kneighbors(X)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/neighbors/_base.py", line 810, in kneighbors
    raise ValueError(
ValueError: Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 5
Fitting model: LightGBMXT ...
    1.0  = Validation score   (accuracy)
    0.29s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: LightGBM ...
    1.0  = Validation score   (accuracy)
    0.28s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: RandomForestGini ...
    1.0  = Validation score   (accuracy)
    0.25s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: RandomForestEntr ...
    1.0  = Validation score   (accuracy)
    0.24s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: CatBoost ...
    1.0  = Validation score   (accuracy)
    0.09s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: ExtraTreesGini ...
    1.0  = Validation score   (accuracy)
    0.24s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: ExtraTreesEntr ...
    1.0  = Validation score   (accuracy)
    0.24s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 0: early stopping
    0.0  = Validation score   (accuracy)
    0.08s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: XGBoost ...
    1.0  = Validation score   (accuracy)
    0.04s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: NeuralNetTorch ...
    Warning: Exception caused NeuralNetTorch to fail during training... Skipping this model.
        float division by zero
Detailed Traceback:
Traceback (most recent call last):
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1733, in _train_and_save
    model = self._train_single(X, y, model, X_val, y_val, total_resources=total_resources, **model_fit_kwargs)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1684, in _train_single
    model = model.fit(X=X, y=y, X_val=X_val, y_val=y_val, total_resources=total_resources, **model_fit_kwargs)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/core/models/abstract/abstract_model.py", line 829, in fit
    out = self._fit(**kwargs)
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py", line 207, in _fit
    self._train_net(
  File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/autogluon/tabular/models/tabular_nn/torch/tabular_nn_torch.py", line 365, in _train_net
    f"Epoch {epoch} (Update {total_updates}).\t"
ZeroDivisionError: float division by zero
Fitting model: LightGBMLarge ...
    1.0  = Validation score   (accuracy)
    0.28s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    1.0  = Validation score   (accuracy)
    0.24s    = Training   runtime
    0.0s     = Validation runtime
AutoGluon training complete, total runtime = 2.5s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240201_083938/")
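The log above twice suggests making the defaults explicit: the inferred 'binary' problem type and the 'accuracy' eval metric can both be fixed at init rather than inferred. A minimal sketch:

TabularPredictor(label='y', problem_type='binary', eval_metric='accuracy')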
predictor.evaluate(df_tst, silent=True)
{'accuracy': 1.0,
 'balanced_accuracy': 1.0,
 'mcc': 1.0,
 'roc_auc': 1.0,
 'f1': 1.0,
 'precision': 1.0,
 'recall': 1.0}
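Metrics such as roc_auc need scores rather than hard labels; the fitted predictor exposes those directly (a sketch):

predictor.predict_proba(df_tst)  # one column per class (0.0 and 1.0)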
predictor.predict(df_tst)
0    0.0
1    1.0
Name: y, dtype: float64
predictor.leaderboard(silent=True)
                  model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0         LightGBMLarge        1.0       0.000750  0.263480                0.000750           0.263480            1       True         10
1              LightGBM        1.0       0.000790  0.269578                0.000790           0.269578            1       True          2
2              CatBoost        1.0       0.000808  0.087998                0.000808           0.087998            1       True          5
3            LightGBMXT        1.0       0.000830  0.266466                0.000830           0.266466            1       True          1
4               XGBoost        1.0       0.001658  0.041625                0.001658           0.041625            1       True          9
5      RandomForestGini        1.0       0.017526  0.241953                0.017526           0.241953            1       True          3
6        ExtraTreesEntr        1.0       0.017991  0.240180                0.017991           0.240180            1       True          7
7        ExtraTreesGini        1.0       0.018317  0.243194                0.018317           0.243194            1       True          6
8      RandomForestEntr        1.0       0.018408  0.241831                0.018408           0.241831            1       True          4
9   WeightedEnsemble_L2        1.0       0.018666  0.497928                0.000349           0.254734            2       True         11
10      NeuralNetFastAI        0.0       0.003675  0.084893                0.003675           0.084893            1       True          8
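leaderboard() can also be handed the test frame, which adds a score_test column next to score_val; a sketch:

predictor.leaderboard(df_tst, silent=True)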

TODO: for each of those models AutoGluon fits.. write out the model name one by one and, with its predictions against yy organized per model, …………………….. collect the results into a DataFrame!
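A minimal sketch of that table, assuming the predictor fitted above: predict() accepts a model= argument, and get_model_names() lists every successfully fitted model.

rows = []
for name in predictor.get_model_names():
    yyhat = predictor.predict(df_tst, model=name)  # per-model predictions on the test set
    rows.append({'model': name, 'acc': (yyhat == df_tst['y']).mean()})
pd.DataFrame(rows)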

path.path
'AutogluonModels/ag-20240201_082455/'
path.fit_summary()
*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0            LightGBMXT        1.0       0.000718  0.285295                0.000718           0.285295            1       True          1
1              LightGBM        1.0       0.000762  0.281390                0.000762           0.281390            1       True          2
2         LightGBMLarge        1.0       0.000781  0.275127                0.000781           0.275127            1       True         10
3              CatBoost        1.0       0.000834  0.086406                0.000834           0.086406            1       True          5
4               XGBoost        1.0       0.001617  0.040825                0.001617           0.040825            1       True          9
5      RandomForestEntr        1.0       0.017598  0.241371                0.017598           0.241371            1       True          4
6        ExtraTreesEntr        1.0       0.018220  0.240957                0.018220           0.240957            1       True          7
7        ExtraTreesGini        1.0       0.018498  0.236784                0.018498           0.236784            1       True          6
8   WeightedEnsemble_L2        1.0       0.018856  0.488569                0.000358           0.251786            2       True         11
9      RandomForestGini        1.0       0.019265  0.241472                0.019265           0.241472            1       True          3
10      NeuralNetFastAI        0.0       0.003558  0.081848                0.003558           0.081848            1       True          8
Number of models trained: 11
Types of models trained:
{'RFModel', 'CatBoostModel', 'XTModel', 'NNFastAiTabularModel', 'WeightedEnsembleModel', 'LGBModel', 'XGBoostModel'}
Bagging used: False 
Multi-layer stack-ensembling used: False 
Feature Metadata (Processed):
(raw dtype, special dtypes):
('float', []) : 1 | ['X']
*** End of fit() summary ***
{'model_types': {'LightGBMXT': 'LGBModel',
  'LightGBM': 'LGBModel',
  'RandomForestGini': 'RFModel',
  'RandomForestEntr': 'RFModel',
  'CatBoost': 'CatBoostModel',
  'ExtraTreesGini': 'XTModel',
  'ExtraTreesEntr': 'XTModel',
  'NeuralNetFastAI': 'NNFastAiTabularModel',
  'XGBoost': 'XGBoostModel',
  'LightGBMLarge': 'LGBModel',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel'},
 'model_performance': {'LightGBMXT': 1.0,
  'LightGBM': 1.0,
  'RandomForestGini': 1.0,
  'RandomForestEntr': 1.0,
  'CatBoost': 1.0,
  'ExtraTreesGini': 1.0,
  'ExtraTreesEntr': 1.0,
  'NeuralNetFastAI': 0.0,
  'XGBoost': 1.0,
  'LightGBMLarge': 1.0,
  'WeightedEnsemble_L2': 1.0},
 'model_best': 'WeightedEnsemble_L2',
 'model_paths': {'LightGBMXT': 'AutogluonModels/ag-20240201_082455/models/LightGBMXT/',
  'LightGBM': 'AutogluonModels/ag-20240201_082455/models/LightGBM/',
  'RandomForestGini': 'AutogluonModels/ag-20240201_082455/models/RandomForestGini/',
  'RandomForestEntr': 'AutogluonModels/ag-20240201_082455/models/RandomForestEntr/',
  'CatBoost': 'AutogluonModels/ag-20240201_082455/models/CatBoost/',
  'ExtraTreesGini': 'AutogluonModels/ag-20240201_082455/models/ExtraTreesGini/',
  'ExtraTreesEntr': 'AutogluonModels/ag-20240201_082455/models/ExtraTreesEntr/',
  'NeuralNetFastAI': 'AutogluonModels/ag-20240201_082455/models/NeuralNetFastAI/',
  'XGBoost': 'AutogluonModels/ag-20240201_082455/models/XGBoost/',
  'LightGBMLarge': 'AutogluonModels/ag-20240201_082455/models/LightGBMLarge/',
  'WeightedEnsemble_L2': 'AutogluonModels/ag-20240201_082455/models/WeightedEnsemble_L2/'},
 'model_fit_times': {'LightGBMXT': 0.2852945327758789,
  'LightGBM': 0.2813901901245117,
  'RandomForestGini': 0.2414722442626953,
  'RandomForestEntr': 0.24137091636657715,
  'CatBoost': 0.08640623092651367,
  'ExtraTreesGini': 0.23678374290466309,
  'ExtraTreesEntr': 0.24095678329467773,
  'NeuralNetFastAI': 0.08184814453125,
  'XGBoost': 0.04082489013671875,
  'LightGBMLarge': 0.2751274108886719,
  'WeightedEnsemble_L2': 0.2517857551574707},
 'model_pred_times': {'LightGBMXT': 0.0007178783416748047,
  'LightGBM': 0.0007622241973876953,
  'RandomForestGini': 0.019264698028564453,
  'RandomForestEntr': 0.01759791374206543,
  'CatBoost': 0.0008337497711181641,
  'ExtraTreesGini': 0.01849818229675293,
  'ExtraTreesEntr': 0.018220186233520508,
  'NeuralNetFastAI': 0.0035576820373535156,
  'XGBoost': 0.0016171932220458984,
  'LightGBMLarge': 0.0007805824279785156,
  'WeightedEnsemble_L2': 0.0003581047058105469},
 'num_bag_folds': 0,
 'max_stack_level': 2,
 'num_classes': 2,
 'model_hyperparams': {'LightGBMXT': {'learning_rate': 0.05,
   'extra_trees': True},
  'LightGBM': {'learning_rate': 0.05},
  'RandomForestGini': {'n_estimators': 300,
   'max_leaf_nodes': 15000,
   'n_jobs': -1,
   'random_state': 0,
   'bootstrap': True,
   'criterion': 'gini'},
  'RandomForestEntr': {'n_estimators': 300,
   'max_leaf_nodes': 15000,
   'n_jobs': -1,
   'random_state': 0,
   'bootstrap': True,
   'criterion': 'entropy'},
  'CatBoost': {'iterations': 10000,
   'learning_rate': 0.05,
   'random_seed': 0,
   'allow_writing_files': False,
   'eval_metric': 'Accuracy'},
  'ExtraTreesGini': {'n_estimators': 300,
   'max_leaf_nodes': 15000,
   'n_jobs': -1,
   'random_state': 0,
   'bootstrap': True,
   'criterion': 'gini'},
  'ExtraTreesEntr': {'n_estimators': 300,
   'max_leaf_nodes': 15000,
   'n_jobs': -1,
   'random_state': 0,
   'bootstrap': True,
   'criterion': 'entropy'},
  'NeuralNetFastAI': {'layers': None,
   'emb_drop': 0.1,
   'ps': 0.1,
   'bs': 'auto',
   'lr': 0.01,
   'epochs': 'auto',
   'early.stopping.min_delta': 0.0001,
   'early.stopping.patience': 20,
   'smoothing': 0.0},
  'XGBoost': {'n_estimators': 10000,
   'learning_rate': 0.1,
   'n_jobs': -1,
   'proc.max_category_levels': 100,
   'objective': 'binary:logistic',
   'booster': 'gbtree'},
  'LightGBMLarge': {'learning_rate': 0.03,
   'num_leaves': 128,
   'feature_fraction': 0.9,
   'min_data_in_leaf': 5},
  'WeightedEnsemble_L2': {'use_orig_features': False,
   'max_base_models': 25,
   'max_base_models_per_type': 5,
   'save_bag_folds': True}},
 'leaderboard':                   model  score_val  pred_time_val  fit_time  \
 0            LightGBMXT        1.0       0.000718  0.285295   
 1              LightGBM        1.0       0.000762  0.281390   
 2         LightGBMLarge        1.0       0.000781  0.275127   
 3              CatBoost        1.0       0.000834  0.086406   
 4               XGBoost        1.0       0.001617  0.040825   
 5      RandomForestEntr        1.0       0.017598  0.241371   
 6        ExtraTreesEntr        1.0       0.018220  0.240957   
 7        ExtraTreesGini        1.0       0.018498  0.236784   
 8   WeightedEnsemble_L2        1.0       0.018856  0.488569   
 9      RandomForestGini        1.0       0.019265  0.241472   
 10      NeuralNetFastAI        0.0       0.003558  0.081848   
 
     pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  \
 0                 0.000718           0.285295            1       True   
 1                 0.000762           0.281390            1       True   
 2                 0.000781           0.275127            1       True   
 3                 0.000834           0.086406            1       True   
 4                 0.001617           0.040825            1       True   
 5                 0.017598           0.241371            1       True   
 6                 0.018220           0.240957            1       True   
 7                 0.018498           0.236784            1       True   
 8                 0.000358           0.251786            2       True   
 9                 0.019265           0.241472            1       True   
 10                0.003558           0.081848            1       True   
 
     fit_order  
 0           1  
 1           2  
 2          10  
 3           5  
 4           9  
 5           4  
 6           7  
 7           6  
 8          11  
 9           3  
 10          8  }
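Since fit_summary() also returns the dict printed above, individual fields can be pulled out programmatically; a sketch:

summary = path.fit_summary(verbosity=0)  # verbosity=0 skips reprinting the report
summary['model_best']                    # 'WeightedEnsemble_L2'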
What autogluon_amt needs
import time
import datetime
from sklearn.model_selection import train_test_split

def autogluon_amt(fraudTrain, predictors):
    # predictors: dict of {name: estimator with an sklearn-style fit/predict interface}
    fraudTrain = fraudTrain[['amt', 'is_fraud']]
    # split into the X/y (train) and XX/yy (test) pieces used below
    X, XX, y, yy = train_test_split(
        fraudTrain[['amt']], fraudTrain['is_fraud'],
        random_state=42, stratify=fraudTrain['is_fraud']
    )
    model = []
    time_diff = []
    acc = []
    pre = []
    rec = []
    f1 = []
    auc = []
    graph_based = []
    pyod = []
    train_size = []
    train_cols = []
    train_frate = []
    test_size = []
    test_frate = []
    hyper_params = []
    for name, predictor in predictors.items():
        t1 = time.time()
        predictor.fit(X, y)
        t2 = time.time()
        yyhat = predictor.predict(XX)
        scores = evaluate(yy, yyhat)  # evaluate() returns a metric dict; a sketch follows below
        model.append(name)
        time_diff.append(t2 - t1)
        acc.append(scores['acc'])
        pre.append(scores['pre'])
        rec.append(scores['rec'])
        f1.append(scores['f1'])
        auc.append(scores['auc'])
        graph_based.append(False)
        pyod.append(False)  # not PyOD models (leftover flag from the PyOD template)
        train_size.append(len(y))
        train_cols.append(list(X.columns))
        train_frate.append(np.array(y).reshape(-1).mean())
        test_size.append(len(yy))
        test_frate.append(np.array(yy).reshape(-1).mean())
        hyper_params.append(None)
    df_results = pd.DataFrame(dict(
        model = model,
        time = time_diff,
        acc = acc,
        pre = pre,
        rec = rec,
        f1 = f1,
        auc = auc,
        graph_based = graph_based,
        pyod = pyod,
        train_size = train_size,
        train_cols = train_cols,
        train_frate = train_frate,
        test_size = test_size,
        test_frate = test_frate,
        hyper_params = hyper_params
    ))
    ymdhms = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d-%H%M%S')
    df_results.to_csv(f'./results/{ymdhms}-autogluon.csv', index=False)
    return df_results
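The evaluate helper called above is not defined anywhere in this post; a minimal sketch using scikit-learn metrics (the name and the metric keys are assumptions matching the calls in autogluon_amt):

from sklearn import metrics

def evaluate(yy, yyhat):
    # Hypothetical helper: returns the metric dict autogluon_amt reads.
    # Note: roc_auc from hard labels is a rough proxy; probabilities would be better.
    return {
        'acc': metrics.accuracy_score(yy, yyhat),
        'pre': metrics.precision_score(yy, yyhat),
        'rec': metrics.recall_score(yy, yyhat),
        'f1':  metrics.f1_score(yy, yyhat),
        'auc': metrics.roc_auc_score(yy, yyhat),
    }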
def throw(df, fraud_rate, random_state=42):  # downsamples normal transactions to hit a target fraud rate!
    df1 = df[df['is_fraud'] == 1].copy()
    df0 = df[df['is_fraud'] == 0].copy()
    # keep len(df1)*(1-fraud_rate)/fraud_rate normal rows, so fraud ends up at exactly fraud_rate
    df0_downsample = (len(df1) * (1-fraud_rate)) / (len(df0) * fraud_rate)
    df0_down = df0.sample(frac=df0_downsample, random_state=random_state)
    df_p = pd.concat([df1, df0_down])
    return df_p
import pickle
with open('fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)    
test = throw(fraudTrain, 0.3)
test.is_fraud.mean()
0.3
fraudTrain.is_fraud.mean()
0.005727773406766326